library(tidyverse) # Core Libraries - dplyr, ggplot2
library(lubridate) # Library for manipulating date
library(skimr) # Library for summary statistic of the data
library(plotly) # Library for interactive plots
library(hrbrthemes) # A theme library
library(janitor) # Library for cleaning dataRead the CSV file into a tibble
download_speed_measurement_tbl <- read_csv('../data/download_speed_measurements.csv')
upload_speed_measurement_tbl <- read_csv('../data/upload_speed_measurements.csv')
details_for_each_person_tbl <- read_csv('../data/details_for_each_person.csv')Exploratory analysis and Cleaning: Check for duplicate rows in the data with the janitor package and generate an overview summary of the input data with the skimr package
Let’s check that we do not have duplicate rows
duplicate_download <- download_speed_measurement_tbl %>% get_dupes()
duplicate_upload <- upload_speed_measurement_tbl %>% get_dupes()
duplicate_users <- details_for_each_person_tbl %>% get_dupes()Summary statistics of the imported raw input files
Download_speed_mesurement_tbl data frame broad overview. For the purpose of simplifying skim() function reporting output , let’s display only the variables with missing value; other information in the comment section below was derived from the full output of the skim() function
download_speed_measurement_tbl %>%
skimr::skim() %>%
filter(n_missing > 0)| Name | Piped data |
| Number of rows | 191200 |
| Number of columns | 4 |
| _______________________ | |
| Column type frequency: | |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| measured_download_speed_in_Mbps | 37739 | 0.8 | 89.12 | 96 | 0 | 13.42 | 33.8 | 183.31 | 300 | ▇▁▁▂▂ |
Upload_speed_mesurement_tbl data frame broad overview. For the purpose of simplifying skim() function reporting output , let’s display only the variables with missing value; other information in the comment section below was derived from the full output of the skim() function The code result is not shown to save space on this report and the output is summarized in the comment section below
upload_speed_measurement_tbl %>%
skimr::skim()%>%
filter(n_missing > 0)details_for_each_person_tbl data frame broad overview. For the purpose of simplifying skim() function reporting output , let’s display only the variables with missing value; other information in the comment section below was derived from the full output of the skim() function The code result is not shown to save space on this report and the output is summarized in the comment section below
details_for_each_person_tbl %>%
skimr::skim() %>%
filter(n_missing > 0)From the DATA_DICTIONARY.md we observe that download_speed_measurement_tbl and details_for_each_person_tbl have a column in common, the person_id column From the DATA_DICTIONARY.md we observe that upload_speed_measurement_tbl and details_for_each_person_tbl have a column in common, the person_id column
For simplicity we will merge the data in two separate step as follows.
download_speed_user_detail_tbl <- download_speed_measurement_tbl %>%
# Join by the common column (person_id)
left_join(details_for_each_person_tbl,
by = c("person_id" = "person_id"))
upload_speed_user_detail_tbl <- upload_speed_measurement_tbl %>%
# Join by person_id the common
left_join(details_for_each_person_tbl,
by = c("person_id" = "person_id"))We are require to filter the data on:
Since we decided to process the download-user-details and upload-users_detail in two separate steps, to avoid repeating the similar processing steps we will define some functions to facilitate the processing.
Define three function:
# filter_by_city_date_test_status function
filter_by_city_date_test_status <-
function(data = data, # input data frame
date = '2020-12-31', # date: Default Date above which to select data
cities = c("Samsville", "Databury"), # cities: Default cities to filter data by
success = TRUE){ # Success: Whether the test was a success (TRUE or FALSE)
data <- data %>% filter(city %in% cities) %>%
# Filter on successful test
filter(did_test_complete_successfully == success) %>%
# Convert the character type time_of_measurement into
# datetime data type using ymd_hms from the lubridate package
mutate(time_of_measurement = ymd_hms(time_of_measurement)) %>%
# select rows where measurement was done after a specific date
filter(time_of_measurement > ymd(date))
return (data)
}
# calculate_speed_percentile_perunit_time function
calculate_speed_percentile_perunit_time <-
function(data = data, # Input data frame
measurement = measured_download_speed_in_Mbps, # unquote column name of speed measurement(download/upload)
unit_time = "day", # Unit time to compute percentile from
probs = .60){ # quantile probability with value in [0,1]
# prepare the measurement to be used inside tidy methods
measurement_expr <- enquo(measurement)
if (as_label(measurement_expr) %in% colnames(data) ){
data <- data %>%
mutate(date = time_of_measurement %>%
floor_date(unit = unit_time)) %>%
group_by(person_id, date) %>%
# Calculate the mean upload or download speed per person per date(day)
summarize(average_measurement = mean(!!measurement_expr)) %>%
ungroup() %>%
group_by(person_id) %>%
# Calculate the nth percentile download or upload speed per user
summarize(mesurement_percentile = quantile(average_measurement,
probs= probs)) %>%
ungroup()
}
else{
stop(str_glue("measurement = {measurement_expr} is not a permitted option." ))
}
return(data)
}
# calculate_speed_average_per_variables function
calculate_speed_average_per_variables <-
function (data = data, # input data frame
..., # unquote variables to group by
measurement = measured_download_speed_in_Mbps){ # unquote column name of speed measurement
measurement_expr <- enquo(measurement)
groupby_vars_expr <- quos(...)
if (as_label(measurement_expr) %in% colnames(data) ){
data <- data %>%
group_by(!!! groupby_vars_expr) %>%
summarise(average_measurement = mean(!!measurement_expr)) %>%
ungroup()
}
else{
stop(str_glue("measurement = {measurement_expr} is not a permitted option." ))
}
return(data)
}Now let’s combine these data frame the Tidy way into a single table which has the structure person_id, city, type_of_broadband_connection, name_of_isp, average_download_speed, average_upload_speed, percentile60_download_speed
Filter download and upload measurement as per instruction
successfull_download_tbl <-
download_speed_user_detail_tbl %>%
# Filter data frame on given date, cities and measurement status
filter_by_city_date_test_status(date = '2020-12-31',
cities = c("Samsville", "Databury"),
success = TRUE)
successfull_upload_tbl <- upload_speed_user_detail_tbl %>%
filter_by_city_date_test_status(date = '2020-12-31',
cities = c("Samsville", "Databury"),
success = TRUE)Calculate and append average and percentiles
download_upload_speed_users_details_tbl <- successfull_download_tbl %>%
# Calculate the average download speed per users
calculate_speed_average_per_variables(person_id,
city,
type_of_broadband_connection,
name_of_isp) %>%
# Rename the average download speed column name
rename(average_download_speed_in_Mbps = average_measurement) %>%
# Calculate and append the average upload speed per user
left_join(successfull_upload_tbl %>%
calculate_speed_average_per_variables(
person_id ,
measurement = measured_upload_speed_in_Mbps) %>%
rename(average_upload_speed_in_Mbps=average_measurement)) %>%
# Optional: Calculate and append the 60th percentile download speed per day
left_join(successfull_download_tbl %>%
calculate_speed_percentile_perunit_time(
unit_time = "day",
measurement = measured_download_speed_in_Mbps)) %>%
# Rename the 60th percentile download speed column name
rename(percentile60_download_speed_in_Mbps = mesurement_percentile)download_upload_speed_users_details_tbl data frame broad overview. For the purpose of simplifying the skim() function reporting output , let’s display only the variable with missing values; other information in the comment section below was derived from the full output of the skim() function The code result is not shown to save space on this report but the output is summarized in the comment section below
download_upload_speed_users_details_tbl %>%
skimr::skim() %>%
filter(n_missing > 0)There are missing data in the raw input files above. However there are no missing data in the filtered and combined data frame (download_upload_speed_users_details_tbl). see comment section above
Relationship between download and upload speeds
download_upload_scatter_plot <- download_upload_speed_users_details_tbl %>%
# Data manipulation: Construct a label for interactive plot
mutate(label_text = str_glue('person_id = {person_id}
download = {round(percentile60_download_speed_in_Mbps,2)} Mbps
upload = {round(average_upload_speed_in_Mbps,2)} Mbps
broadband = {type_of_broadband_connection}
ISP = {name_of_isp}')) %>%
mutate(type_of_broadband_connection = type_of_broadband_connection %>%
fct_relevel("ADSL","VDSL","Fibre")) %>%
# Data Visualization
ggplot(aes(x = percentile60_download_speed_in_Mbps,
y = average_upload_speed_in_Mbps,
color = type_of_broadband_connection)) +
geom_jitter(aes(text = label_text), width =.1 , height = .1) +
scale_x_continuous(trans = "log10") +
scale_y_continuous(trans = "log10") +
facet_wrap(city ~ name_of_isp) +
labs(title = "Average Download Upload speed by ISP by City by Connection type",
x = "Average download speed in Mbps (log scale)",
y = "Average upload speed in Mbps (log scale)",
color = "Connection Type",
caption = "B. Alako: SamKnows Data analyst Technical Test") +
theme_ipsum()
# Interactive plot
ggplotly(download_upload_scatter_plot, tooltip = "text")# Identify theses mislabeled entries
fibre_labeled_as_adsl_tbl <- download_upload_speed_users_details_tbl %>%
filter(type_of_broadband_connection == "ADSL" ,
percentile60_download_speed_in_Mbps > 25) %>%
mutate(average_download_speed_in_Mbps = round(average_download_speed_in_Mbps,2))
adsl_labeled_as_fibre_tbl <- download_upload_speed_users_details_tbl %>%
filter(type_of_broadband_connection == "Fibre" ,
percentile60_download_speed_in_Mbps < 25) %>%
mutate(average_download_speed_in_Mbps = round(average_download_speed_in_Mbps,2))Fibre connection type labeled as ADSL connection type
fibre_labeled_as_adsl_tbl| person_id | city | type_of_broadband_connection | name_of_isp | average_download_speed_in_Mbps | average_upload_speed_in_Mbps | percentile60_download_speed_in_Mbps |
|---|---|---|---|---|---|---|
| 11560 | Samsville | ADSL | Useus | 195.92 | 78.16990 | 201.9052 |
| 28964 | Databury | ADSL | Fibrelicious | 195.60 | 75.88275 | 199.0207 |
| 35703 | Databury | ADSL | Fibrelicious | 215.77 | 72.09362 | 222.4196 |
| 40650 | Samsville | ADSL | Fibrelicious | 182.79 | 73.32865 | 184.9753 |
| 43120 | Databury | ADSL | Fibrelicious | 230.64 | 78.57891 | 237.4308 |
| 57069 | Samsville | ADSL | Useus | 230.99 | 79.45140 | 234.7654 |
ADSL connection type labeled as Fibre connection type
adsl_labeled_as_fibre_tbl| person_id | city | type_of_broadband_connection | name_of_isp | average_download_speed_in_Mbps | average_upload_speed_in_Mbps | percentile60_download_speed_in_Mbps |
|---|---|---|---|---|---|---|
| 12309 | Samsville | Fibre | Useus | 9.10 | 0.5509977 | 9.732604 |
| 33581 | Databury | Fibre | Useus | 9.45 | 0.5494934 | 9.818683 |
| 37494 | Samsville | Fibre | Useus | 9.16 | 0.5590463 | 9.492432 |
| 46558 | Databury | Fibre | Fibrelicious | 14.47 | 0.6969558 | 14.867715 |
| 53451 | Samsville | Fibre | Fibrelicious | 14.49 | 0.6903853 | 14.987488 |
| 68098 | Samsville | Fibre | Fibrelicious | 9.34 | 0.5612649 | 9.496197 |
Let’s correctly relabel mislabeled connection type
download_upload_speed_users_details_tbl <-
download_upload_speed_users_details_tbl %>%
mutate(type_of_broadband_connection = case_when(
type_of_broadband_connection == "ADSL" & percentile60_download_speed_in_Mbps > 25 ~ "Fibre",
type_of_broadband_connection == "Fibre" & percentile60_download_speed_in_Mbps < 25 ~ "ADSL",
TRUE ~ type_of_broadband_connection
))Difference in download speed between ISP for each connection type. Since the procedure to calculate the difference in download speed and difference in upload speed between ISP for each Connection type is the same, let’s define a function to facilitate this process
# A function to summarize the download or upload speed difference between ISP
speed_difference_btw_isp <-
function(data = data, # input data frame
..., # unquote variables to group by
measurement = percentile60_download_speed_in_Mbps # unquote column name of speed measurement
){
groupby_vars_expr <- quos(...)
measurement_expr <- enquo(measurement)
if (as_label(measurement_expr) %in% colnames(data) ){
data <- data %>%
select(type_of_broadband_connection,
name_of_isp, city,
!! measurement_expr) %>%
group_by(!!! groupby_vars_expr) %>%
summarize(mean_measurement = mean(!! measurement_expr)) %>%
ungroup() %>%
pivot_wider(names_from = name_of_isp,
values_from = mean_measurement) %>%
mutate(type_of_broadband_connection = type_of_broadband_connection %>%
fct_relevel("ADSL","VDSL","Fibre"))
}
else{
stop(str_glue("measurement = {measurement_expr} is not a permitted option." ))
}
return(data)
} Download speed difference and upload speed difference between ISP for each connection type
# Difference in Download between ISP by connection type
difference_in_download_btw_isp_tbl <- download_upload_speed_users_details_tbl %>%
speed_difference_btw_isp(type_of_broadband_connection,
name_of_isp,
measurement = percentile60_download_speed_in_Mbps ) %>%
mutate(Fibrelicious = round(Fibrelicious, 2),
Useus = round(Useus,2))
difference_in_download_btw_isp_tbl| type_of_broadband_connection | Fibrelicious | Useus |
|---|---|---|
| ADSL | 9.83 | 14.50 |
| Fibre | 224.20 | 192.46 |
| VDSL | 32.10 | 26.76 |
# Difference in Download between ISP by connection type by city
difference_in_download_btw_isp_by_city_tbl <-
download_upload_speed_users_details_tbl %>%
speed_difference_btw_isp(type_of_broadband_connection,
name_of_isp,
city,
measurement = percentile60_download_speed_in_Mbps ) %>%
mutate(Fibrelicious = round(Fibrelicious, 2),
Useus = round(Useus,2))
difference_in_download_btw_isp_by_city_tbl| type_of_broadband_connection | city | Fibrelicious | Useus |
|---|---|---|---|
| ADSL | Databury | 9.95 | 14.73 |
| ADSL | Samsville | 9.73 | 14.32 |
| Fibre | Databury | 219.15 | 186.18 |
| Fibre | Samsville | 232.50 | 198.03 |
| VDSL | Databury | 21.00 | 18.36 |
| VDSL | Samsville | 41.41 | 36.21 |
# Difference in upload speed between ISP by connection type
difference_in_upload_btw_isp_tbl <- download_upload_speed_users_details_tbl %>%
speed_difference_btw_isp(type_of_broadband_connection,
name_of_isp,
measurement = average_upload_speed_in_Mbps ) %>%
mutate(Fibrelicious = round(Fibrelicious, 2),
Useus = round(Useus,2))
difference_in_upload_btw_isp_tbl| type_of_broadband_connection | Fibrelicious | Useus |
|---|---|---|
| ADSL | 0.55 | 0.68 |
| Fibre | 73.75 | 75.26 |
| VDSL | 8.08 | 6.62 |
# Difference in Upload speed between ISP by connection type by city
difference_in_upload_btw_isp_by_city_tbl <-
download_upload_speed_users_details_tbl %>%
speed_difference_btw_isp(type_of_broadband_connection,
name_of_isp,
city,
measurement = average_upload_speed_in_Mbps ) %>%
mutate(Fibrelicious = round(Fibrelicious, 2),
Useus = round(Useus,2))
difference_in_upload_btw_isp_by_city_tbl| type_of_broadband_connection | city | Fibrelicious | Useus |
|---|---|---|---|
| ADSL | Databury | 0.57 | 0.69 |
| ADSL | Samsville | 0.54 | 0.67 |
| Fibre | Databury | 72.10 | 72.94 |
| Fibre | Samsville | 76.47 | 77.31 |
| VDSL | Databury | 5.36 | 4.56 |
| VDSL | Samsville | 10.36 | 8.94 |
Distribution of average download speed by ISP by city by connection type
# Data manipulation
download_upload_speed_distibution_plot <-
download_upload_speed_users_details_tbl %>%
select(type_of_broadband_connection,
name_of_isp,
city,
percentile60_download_speed_in_Mbps) %>%
mutate(type_of_broadband_connection = type_of_broadband_connection %>%
fct_relevel("ADSL","VDSL","Fibre")) %>%
# Data Visualization
ggplot(aes(x = percentile60_download_speed_in_Mbps,
fill = type_of_broadband_connection)) +
geom_histogram(binwidth = 2,
alpha = .7,
position ="identity") +
facet_wrap(name_of_isp~ city) +
labs(title = "Distribution of Average Download Speed by \nISP by City by Connection type",
x = "Average download speed (Mpbs)",
y = "Number of Users",
fill = "Connection Type",
caption = "B. Alako: SamKnows Data analyst Technical Test") +
theme_ipsum()
download_upload_speed_distibution_plotBetter/Worse download speed from Fibrelicious or from Useus, How much better/worse Let’s get the answer from the download-speed-between-ISP-by-city-and-by-connection-type data frame computed above
difference_in_download_btw_isp_by_city_tbl| type_of_broadband_connection | city | Fibrelicious | Useus |
|---|---|---|---|
| ADSL | Databury | 9.95 | 14.73 |
| ADSL | Samsville | 9.73 | 14.32 |
| Fibre | Databury | 219.15 | 186.18 |
| Fibre | Samsville | 232.50 | 198.03 |
| VDSL | Databury | 21.00 | 18.36 |
| VDSL | Samsville | 41.41 | 36.21 |
Let’s explore the overall download speed per hour per day by ISP by connection type We hope to derive from this exploration an insight of the expected download difference by ISP To achieve this let’s define a function to manipulate the data frame accordingly
# A function for data manipulation
download_speed_by_isp_per_hour <- function(data = data , isp_speed_difference = FALSE){
data_tbl <- data %>%
select(time_of_measurement,
measured_download_speed_in_Mbps,
type_of_broadband_connection,
name_of_isp) %>%
mutate(time_of_measurement = hour(time_of_measurement)) %>%
group_by(type_of_broadband_connection,
name_of_isp,
time_of_measurement) %>%
summarize(measured_download_speed_in_Mbps = mean(measured_download_speed_in_Mbps)) %>%
ungroup()
if (isp_speed_difference){
data_tbl <- data_tbl %>%
pivot_wider(names_from = name_of_isp,
values_from = measured_download_speed_in_Mbps) %>%
mutate(measured_download_speed_in_Mbps = abs(Fibrelicious - Useus)) %>%
mutate(label_text = str_glue("download speed = {round(measured_download_speed_in_Mbps,2)} Mbps"))
}
else{
data_tbl <- data_tbl %>%
mutate(label_text = str_glue("download speed = {round(measured_download_speed_in_Mbps,2)} Mbps"))
}
data_tbl <- data_tbl %>%
mutate(label_text = str_glue("Time: {time_of_measurement} o'clock
{label_text}")) %>%
mutate(type_of_broadband_connection = type_of_broadband_connection %>%
fct_relevel("ADSL","VDSL","Fibre"))
return(data_tbl)
}
# A function for visualizing the manipulated data
plot_download_by_isp <- function(data = data, isp_speed_difference = FALSE, legend_position= "none"){
download_speed_plot <- download_speed_by_isp_per_hour(data = data,
isp_speed_difference = isp_speed_difference ) %>%
# Data visualization
ggplot(aes(time_of_measurement,
measured_download_speed_in_Mbps,
text = label_text,
color = type_of_broadband_connection,
group = type_of_broadband_connection)) +
geom_point() +
geom_line() +
theme_ipsum() +
theme(legend.position = legend_position)
return(download_speed_plot)
} Download speed by ISP by connection type per hour of the day
# Relabel connection type as per observation above
successfull_download_tbl <- successfull_download_tbl %>%
mutate(type_of_broadband_connection = case_when(
type_of_broadband_connection == "ADSL" & measured_download_speed_in_Mbps > 25 ~ "Fibre",
type_of_broadband_connection == "Fibre" & measured_download_speed_in_Mbps < 25 ~ "ADSL",
TRUE ~ type_of_broadband_connection
))download_by_isp_plot <- successfull_download_tbl %>%
plot_download_by_isp() +
facet_wrap(name_of_isp ~ type_of_broadband_connection,
scale = "free", ncol=3) +
labs(title = "Download by ISP at different time of the day",
x = "Time of the day (hours)",
y = "Average download speed in Mbps",
fill = "Connection Type",
caption = "SamKnows Data analyst assessment test")
# Interactive plot
ggplotly(download_by_isp_plot, tooltip = "text") Download speed difference by ISP by connection type per hour of the day
difference_download_by_isp_plot <- successfull_download_tbl %>%
plot_download_by_isp(isp_speed_difference = TRUE) +
facet_wrap(~type_of_broadband_connection,
scale = "free", ncol=3) +
labs(title = "Difference in Download speed by ISP at different time of the day",
x = "Time of the day (hours)",
y = "Difference in download speed (Mbps)",
fill = "Connection Type",
caption = "B. Alako: SamKnows Data analyst assessment test")
ggplotly(difference_download_by_isp_plot, tooltip = "text")
Comments